Utilities codes


In [1]:
def describe(table,include='all',showNullIndexes=False):
    import numpy as np
    print table.describe(include=include)
    print "\n-> Has Nil? (How many?)"
    has_nil = table.isnull().apply(np.sum)
    print has_nil
    if showNullIndexes:
        for c in has_nil.index:
            if not has_nil[c]: continue
            print "\n-> Indexes where column '{}' is null:".format(c)
            print table[table[c].isnull()].index.values

Plotting codes

This first cell(s) is(are) dedicated to place the codes used throught the notebook. Reason for that is just organization and clarity.


In [2]:
import matplotlib
class PlotDistro:
    @staticmethod
    def boxhist(data,column,**kw):
        from matplotlib import pyplot as plt
        fig,axs = plt.subplots(1,2)
        fig.set_size_inches(15,5)
        
        data[column].plot.box(ax=axs[0])

        bins = kw['bins'] if kw.has_key('bins') else 10
        log = kw['log'] if kw.has_key('log') else False
        data[column].plot.hist(ax=axs[1],bins=bins,log=log)
        
        return plt

In [3]:
import numpy

def histogram2stepfunction(hist,bins):
    hist = hist.tolist()
    hh = hist+hist
    hh[::2] = hist
    hh[1::2] = hist
    hist = hh[:]
    bins = bins.tolist()
    bb = bins[:-1]+bins[1:]
    bb.sort()
    bins = bb[:]
    assert len(hist)==len(bins)
    return hist,bins

class Distro:
    
    @staticmethod
    def binning(vector,nbins,xmin=None,xmax=None,spacing='linear'):
        """
        """
        import numpy as np
        spacing_function = {'linear' : np.linspace}

        if spacing is not 'linear':
            spacing = 'linear'

        xmin = vector.min() if xmin is None else xmin
        xmax = vector.max() if xmax is None else xmax
        nbins = 10 if not (nbins >= 1) else nbins

        bins = spacing_function[spacing](xmin,xmax,nbins)
        return bins

        
    @staticmethod
    def histogram(vector,bins):
        """
        """
        import numpy as np

        h,b = np.histogram(vector,bins=bins,normed=False)
        assert np.array_equal(b,bins)
        return h,b

In [4]:
class Colors:
    # From https://en.wikipedia.org/wiki/Web_colors
    rgb = {
        'blueish' :   ['#0000FF',
                       '#9933FF',
                       '#0066FF'],
        'greenish' :  ['#00FF00',
                       '#66CC00',
                       '#009900'],
        'redish' :    ['#FF0000',
                       '#993300',
                       '#FF6600'],
        'yellowish' : ['#FFFF00',
                       '#CCCC00',
                       '#999900']
    }
    mono = {
        'grayish' :   ['#000000'
                      '#808080'
                      '#FFFFFF']
    }
    #blue = blueish[0]
    #red = redish[0]
    #green = greenish[0]
    #yellow = yellowish[0]
    
    @classmethod
    def get_colors(this,N,mode='sparse'):
        """
        mode = sparse
        """
        mode = mode if mode is 'sparse' else 'sparse'
        from collections import OrderedDict
        if mode is 'sparse':
            groups = this.rgb.keys()
            groups.sort()
            color_groups = OrderedDict()
            for g in groups:
                color_groups[g] = this.rgb.get(g)[:]
            
        #TODO: make this selection better, probably objectfying each color group
        colors = []
        while len(colors)<N:
            for _g in color_groups.keys():
                try:
                    colors.append(color_groups[_g].pop(0))
                except IndexError,e:
                    raise IndexError,"You're asking me {0} colors, more than I have to offer.".format(N)
                if len(colors) == N:
                       break
        return colors

In [5]:
import bokeh
import numpy

class PlotHisto:
    
    @staticmethod
    def init_figure(tools = None,logscale=False):
        """
        """
        from bokeh.plotting import figure
        from bokeh.models.tools import  PanTool,\
                                        BoxZoomTool,\
                                        WheelZoomTool,\
                                        ResizeTool,\
                                        ResetTool,\
                                        HelpTool
        from bokeh.models.tools import  Tool
        TOOLS = [PanTool(),BoxZoomTool(),WheelZoomTool(),ResizeTool(),ResetTool(),HelpTool()]
        if tools:
            if not isinstance(tools,(list,tuple)):
                tools = [tools]
            for t in tools:
                if not issubclass(t.__class__,Tool):
                    continue
                TOOLS.append(t)
        #TOOLS = 'pan,box_zoom,wheel_zoom,crosshair,hover,resize,reset'
        y_type = 'log' if logscale else 'linear'
        fig = figure(tools=TOOLS,y_axis_type=y_type)
        return fig
        
    @staticmethod
    def bar(counts,bins,label,color,alpha=0.5,figure=None):
        """
        """
        bottoms = [0]*len(counts)
        figure = PlotHisto.bar_stacked(tops=counts,
                                       bottoms=bottoms,
                                       bins=bins,
                                       label=label,
                                       color=color,
                                       alpha=alpha,
                                       figure=figure)
        return figure
    
    @staticmethod
    def bar_stacked(tops,bottoms,bins,label,color,alpha=None,figure=None):
        """
        """
        figure = PlotHisto._bar(tops,bottoms,bins[:-1],bins[1:],
                               label,color,alpha=alpha,figure=figure)
        return figure
    
    @staticmethod
    def bar_adjacent(counts,lefts,rights,label,color,alpha=None,figure=None):
        """
        """
        bottoms = [0]*len(counts)
        figure = PlotHisto._bar(counts,bottoms,lefts,rights,
                               label,color,alpha=alpha,figure=figure)
        return figure
        
    @staticmethod
    def _bar(tops,bottoms,lefts,rights,label,color,alpha,figure):
        """
        """
        assert len(tops)==len(bottoms) or isinstance(bottoms,(int,float))
        assert len(lefts)==len(rights)

        if figure is None:
            figure = init_figure()
            
        figure.quad(top=tops,
                    bottom=bottoms,
                    left=lefts,
                    right=rights,
                    fill_color=color,fill_alpha=alpha,
                    legend=label)
        return figure
        
    @staticmethod
    def step(counts,bins,label,color,figure):
        """
        """
        _y,_x = histogram2stepfunction(counts,bins)
        figure.line(x=_x,
                    y=_y,
                    #line_color="#D95B43",line_width=2,
                    line_color=color,line_width=2,
                    legend='label')

        import numpy as np
        _x = np.diff(bins)/2+bins[:-1]
        figure.circle(x=_x,
                      y=_y,
                      #line_color="#D95B43",line_width=2,
                      line_color=color,line_width=2,
                      fill_color="white",fill_alpha=1,
                      size=9,
                      legend='label')
        return figure
    

    @staticmethod
    def multihist(df,column,groupby,nbins=10,mode='over',logscale=False):
        """
        mode = [over,stacked,adjacent]
        """
        from bokeh.models import CrosshairTool,HoverTool
        
        import numpy as np

        fig = PlotHisto.init_figure(tools=[CrosshairTool(),HoverTool()],logscale=logscale)

        label = column
        
        fig.select(CrosshairTool).dimensions = ['height']
        fig.select(HoverTool).mode = 'hline'
        label_tip = "{}: ".format(label)
        fig.select(HoverTool).tooltips = [(label_tip,"$x")]

        fig.xgrid.grid_line_color = None            #vgrid
        fig.ygrid.minor_grid_line_color = 'gray'    #hgrid
        fig.ygrid.minor_grid_line_alpha = 0.5
        fig.ygrid.minor_grid_line_dash = 'dashed'

        fig.yaxis.axis_label = 'Counts'
        fig.xaxis.axis_label = label
        
        bins = Distro.binning(df[column],nbins,spacing='linear')
        
        groups = df.groupby(groupby)
        ngroups = len(groups)
        colors = Colors.get_colors(ngroups)
        
        counts_last = [0]*(len(bins)-1)
        for i,grp_key in enumerate(groups.indices.keys()):
            _data = df.loc[groups.groups[grp_key],(column)]
            _counts,_bins = Distro.histogram(_data,bins)
            
            if mode is 'stacked':
                _counts = [ _counts[_i]+_bot for _i,_bot in enumerate(counts_last) ]
                fig = PlotHisto.bar_stacked(_counts,counts_last,_bins,str(i),colors[i],figure=fig)
                
            elif mode is 'adjacent':
                bins_left,bins_right = [],[]
                for _i in range(len(bins)-1):
                    _binsplit = np.linspace(bins[_i],bins[_i+1],ngroups+1)
                    bins_left.append(_binsplit[i])
                    bins_right.append(_binsplit[i+1])
                fig = PlotHisto.bar_adjacent(_counts,bins_left,bins_right,str(i),colors[i],figure=fig)

            else:
                fig = PlotHisto.bar(_counts,bins,str(i),colors[i],alpha=float(1)/ngroups,figure=fig)
            last_counts = _counts[:]
            #else:
            #    _data = df.loc[grp.groups[grp_key],(column)]
            #    _counts,_bins = Distro.histogram(_data,bins)
            #    fig = PlotHisto.step(_counts,bins,i,fig)
        
        return fig

In [6]:
class PlotBox:
    
    @staticmethod
    def boxplot(df,column,by,mean=True):
        """
        """
        from bokeh.plotting import figure
        from bokeh.models import Range1d
        
        if mean:
            df['mean'] = df.groupby(by)[column].transform(lambda x:x-x.mean())
        groups = df.groupby(by)
        if mean:
            groups = groups['mean']
        else:
            groups = groups[column]

        # Now a 'try' statement because I don't want to get in to the details when I'm not using Pandas.Categories.
        try:
            # When the 'by' argument is a DF.Categorical instance, we have to do some transforming.
            # Notice that this Categorical (group) labels are strings containing the range of each bin,
            #  so that we get from them the "left,right" values for the 'bins' to use.
            import re
            _bins = set([ re.sub(r'[^\d.]+','',s) for c in df[by].values.categories for s in c.split(',') ])
            _bins = list(_bins)
            _bins.sort()
            _bins = np.asarray(_bins,dtype=np.float)
        except:
            # Now, when I am not using Categories. In particular, now I'm passing "by" as a vector/Series
            # containing numerical labels
            assert False
        _diff = np.diff(_bins)
        _center = _bins[:-1] + _diff/2

        # Generate some synthetic time series for six different categories
        cats = [ s for s,g in groups ]

        # Find the quartiles and IQR foor each category
        q1 = groups.quantile(q=0.25)
        q2 = groups.quantile(q=0.5)
        q3 = groups.quantile(q=0.75)
        iqr = q3 - q1
        upper = q3 + 1.5*iqr
        lower = q1 - 1.5*iqr

        # find the outliers for each category
        def outliers(group):
            cat = group.name
            return group[(group > upper.loc[cat][0]) | (group < lower.loc[cat][0])]
        out = groups.apply(outliers).dropna()

        # Prepare outlier data for plotting, we need coordinate for every outlier.
        outx = []
        outy = []
        for i,cat in enumerate(cats):
            # only add outliers if they exist
            if not out.loc[cat].empty:
                for value in out[cat]:
                    outx.append(_center[i])
                    outy.append(value)

        p = figure(title="")

        from bokeh.models import FixedTicker
        p.x_range = Range1d(_bins.min(),_bins.max())
        p.xaxis.ticker = FixedTicker(ticks=_center)

        # If no outliers, shrink lengths of stems to be no longer than the minimums or maximums
        qmin = groups.quantile(q=0.00)
        qmax = groups.quantile(q=1.00)
        upper = [ min([x,y]) for (x,y) in zip(qmax,upper) ]
        lower = [ max([x,y]) for (x,y) in zip(qmin,lower) ]

        # stems
        p.segment(_center, upper, _center, q3, line_width=2, line_color="black")
        p.segment(_center, lower, _center, q1, line_width=2, line_color="black")

        # boxes
        p.rect(_center, (q3+q2)/2, _diff/2, q3-q2,    fill_color="#E08E79", line_width=2, line_color="black")
        p.rect(_center, (q2+q1)/2, _diff/2, q2-q1,    fill_color="#3B8686", line_width=2, line_color="black")

        # whiskers (almost-0 height rects simpler than segments)
        p.rect(_center, lower, _diff/4, 0.002, line_color="black")
        p.rect(_center, upper, _diff/4, 0.002, line_color="black")

        # outliers
        p.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)

        p.xgrid.grid_line_color = None
        p.ygrid.minor_grid_line_color = 'gray'
        p.ygrid.minor_grid_line_alpha = 0.5
        p.ygrid.minor_grid_line_dash = 'dashed'

        p.xaxis.major_label_text_font_size="12pt"
        p.xaxis.major_label_orientation = -3.14/2

        p.xaxis.axis_label = by
        p.yaxis.axis_label = column if not mean else column + ' (0-mean)'

        return p

2dF zcat overview

The catalog we're going to play here is the "best.observations.idz" spectroscopic catalogue.

Before proceeding, let me quote the website's ASCII catalogues section: "These ASCII catalogues are provided for the convenience of users of the 2dFGRS Final Data Release. However it is important to note that the mSQL database is the definitive data source, and contains all the information in the ASCII catalogues and much additional data."

That said, we can go to the data...ok, not so soon. We have to deal with the (ascii) catalogue and its columns format first. The best.observation.idz columns description summary is the following:

#name     type    description
serial    I6      Database serial number (=SEQNUM)
spectra   I1      Number of spectra obtained
name      A10     2dFGRS name (=NAME)
UKST      A3      UKST plate (=IFIELD)
ra        A11     R.A. (B1950)
dec       A11     Dec. (B1950)
ra2000    A11     R.A. (J2000)
dec2000   A11     Dec. (J2000)
BJG       F6.3    Final bj magnitude without extinction correction
BJSEL     F6.3    Final bj magnitude with extinction correction
BJG_OLD   F6.3    Original bj magnitude without extinction correction
BJSELOLD  F6.3    Original bj magnitude with extinction correction
GALEXT    F5.3    Galactic extinction value
SB_BJ     F6.3    SuperCosmos bj magnitude without extinction correction
SR_R      F6.3    SuperCosmos R magnitude without extinction correction
z         F9.6    Best redshift (observed)
z_helio   F9.6    Best redshift (heliocentric)
obsrun    A5      Observation run of best spectrum
quality   I1      Redshift quality parameter for best spectrum (reliable redshifts have quality>=3)
abemma    I1      Redshift type (abs=1, emi=2, man=3)
Z_ABS     F9.6    Cross-correlation redshift from best spectrum
KBESTR    I1      Cross-correlation template from best spectrum
R_CRCOR   F5.3    Cross-correlation R value from best spectrum
Z_EMI     F9.6    Emission redshift from best spectrum
NMBEST    I2      Number of emission lines for Z_EMI from best spectrum
SNR       F6.2    Median S/N per pixel from best spectrum
ETA_TYPE  F10.6   Eta spectral type parameter from best spectrum (-99.9 if none)

The catalogue (data file) itself has no header (i.e, column names).

So, after adding the column names header line, fixing the positional (ra,dec) columns (before: hh mm ss, after: hh:mm:ss) and subsituting the whitespaces to semicolons we have the new version of the catalogue, named best.observations.idz.csv, which the first 5 lines -- 2 header, 3 data -- are the following:

#I6;I1;A10;A3;A11;A11;A11;A11;F6.3;F6.3;F6.3;F6.3;F5.3;F6.3;F6.3;F9.6;F9.6;A5;I1;I1;F9.6;I1;F5.3;F9.6;I2;F6.2;F10.6
serial;spectra;name;UKST;ra;dec;ra2000;dec2000;BJG;BJSEL;BJG_OLD;BJSELOLD;GALEXT;SB_BJ;SR_R;z;z_helio;obsrun;quality;abemma;Z_ABS;KBESTR;R_CRCOR;Z_EMI;NMBEST;SNR;ETA_TYPE
1;2;TGS436Z001;349;00:11:55.72;-32:32:55.2;00:14:27.05;-32:16:14.6;19.424;19.362;19.430;19.390;0.062;19.368;18.286;0.2981;0.2981;01SEP;4;1;0.2981;5;4.5700;0.2984;1;3.8;-99.90000
2;1;TGS496Z001;349;00:11:59.29;-33:14:41.3;00:14:30.55;-32:58:00.7;18.842;18.789;18.870;18.840;0.053;18.688;17.291;0.1229;0.1228;01OCT;5;1;0.1229;1;14.3800;-9.9990;0;47.6;-2.58920
3;1;TGS435Z001;349;00:11:49.37;-32:39:57.4;00:14:20.71;-32:23:16.8;18.320;18.265;18.350;18.310;0.055;18.336;17.138;0.1038;0.1038;01SEP;4;1;0.1038;1;9.3800;0.1032;1;28.4;-2.46500

This is the file we are going to deal now.


In [7]:
# The reading of this catalogue could be done either using Pandas or Astropy.
# To have some homogeneity (looking into the future) let's do it with Astropy.
import astropy
from astropy.io import ascii
tab = ascii.read('best.observations.idz.csv',
                 format='csv', delimiter=';',
                 header_start=1)

In [8]:
# Summary
tab.info


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/astropy/table/column.py:268: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return self.data.__eq__(other)
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/astropy/table/info.py:94: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if np.all(info[name] == ''):
Out[8]:
<Table length=245591>
  name    dtype 
-------- -------
  serial   int64
 spectra   int64
    name   str10
    UKST   int64
      ra   str11
     dec   str11
  ra2000   str11
 dec2000   str11
     BJG float64
   BJSEL float64
 BJG_OLD float64
BJSELOLD float64
  GALEXT float64
   SB_BJ float64
    SR_R float64
       z float64
 z_helio float64
  obsrun    str5
 quality   int64
  abemma   int64
   Z_ABS float64
  KBESTR   int64
 R_CRCOR float64
   Z_EMI float64
  NMBEST   int64
     SNR float64
ETA_TYPE float64

Since the info provided regards the columns and data types, let's see what Pandas gives to us.


In [9]:
import pandas
df = tab.to_pandas()

In [10]:
pandas.set_option('display.max_columns',25)
pandas.set_option('display.width',1000)
df.describe()


Out[10]:
serial spectra UKST BJG BJSEL BJG_OLD BJSELOLD GALEXT SB_BJ SR_R z z_helio quality abemma Z_ABS KBESTR R_CRCOR Z_EMI NMBEST SNR ETA_TYPE
count 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.00000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000 245591.000000
mean 189248.420280 1.099132 570.171883 18.640340 18.532736 18.746687 18.668368 0.107604 18.625657 17.496698 -0.03712 -0.037142 4.027273 1.266260 0.000272 3.374509 6.403765 -1.494766 1.723813 15.747094 -21.349351
std 110943.954708 0.324562 215.249118 0.770491 0.773140 0.764504 0.763645 0.057304 0.955172 1.017280 1.16079 1.160787 0.795907 0.516379 1.041420 2.043343 3.778627 3.738832 1.384722 13.700003 41.108474
min 1.000000 1.000000 75.000000 13.558000 13.348000 14.050000 14.010000 0.004000 0.000000 0.000000 -9.00000 -9.000000 1.000000 1.000000 -9.999000 0.000000 -9.990000 -9.999000 0.000000 0.000000 -99.900000
25% 90888.500000 1.000000 410.000000 18.292000 18.182500 18.440000 18.360000 0.067000 18.282000 17.091000 0.06800 0.068000 4.000000 1.000000 0.063100 2.000000 3.890000 0.056900 1.000000 7.800000 -3.220400
50% 182547.000000 1.000000 476.000000 18.856000 18.748000 18.970000 18.900000 0.090000 18.853000 17.643000 0.10770 0.107700 4.000000 1.000000 0.106700 3.000000 5.670000 0.124500 1.000000 12.400000 -1.576600
75% 289966.500000 1.000000 791.000000 19.201000 19.095000 19.290000 19.220000 0.135000 19.205000 18.084000 0.14930 0.149300 5.000000 1.000000 0.149650 5.000000 8.340000 0.196200 2.000000 19.400000 0.921300
max 389712.000000 6.000000 892.000000 19.848000 19.613000 19.850000 19.500000 0.628000 21.286000 21.731000 3.50000 3.499900 5.000000 3.000000 0.349700 8.000000 39.820000 0.656800 7.000000 450.000000 72.892300

In [11]:
df.describe(include=['O'])


Out[11]:
name ra dec ra2000 dec2000 obsrun
count 245591 245591 245591 245591 245591 245591
unique 245591 236954 215195 237939 215158 33
top TGS491Z271 23:52:44.48 -01:46:34.0 12:17:37.59 -29:17:53.2 00SEP
freq 1 4 6 4 6 22396

Positions: sexagesimal to degree

First of all, let's translate the sexagesimal positions to degree to have simpler quantities; ones that we can plot, for example.


In [12]:
class Coords:
    @staticmethod
    def sex2deg(ra,dec,frame='icrs'):
        from astropy.coordinates import SkyCoord
        from astropy import units
        c = SkyCoord(ra=ra, dec=dec,
                     unit=(units.hourangle,units.deg),
                     frame=frame)
        return c.ra.value,c.dec.value

In [13]:
# Since I am not interested on B1950 coordinates, I will just substitute those values with the J2000 in degrees

# First, though, let me copy the table
data = df.copy()

In [14]:
#data.ra,data.dec = Coords.sex2deg(data.ra2000,data.dec2000)

# And let me update the original(df) table with these values, just for the records ;)
#df.ra,df.dec = data.ra,data.dec

Redshift and their flags

Let us see what are the properties of the (best) redshift distribution, and the relations with corresponding flags.


In [15]:
data = df[['z','quality','abemma']]
data.describe()


Out[15]:
z quality abemma
count 245591.00000 245591.000000 245591.000000
mean -0.03712 4.027273 1.266260
std 1.16079 0.795907 0.516379
min -9.00000 1.000000 1.000000
25% 0.06800 4.000000 1.000000
50% 0.10770 4.000000 1.000000
75% 0.14930 5.000000 1.000000
max 3.50000 5.000000 3.000000

In [16]:
%matplotlib inline


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

The redshift only distribution


In [17]:
p = PlotDistro.boxhist(data,'z')


With the description summary and the plots above we see there are some "Null" values (i.e, "-9") in our data we were not aware of before (at least, I wasn't). Let's remove them by giving "None" to them.


In [18]:
_znil = data.z==-9
data.ix[_znil,'z'] = None


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/pandas/core/indexing.py:426: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s

In [19]:
data.describe()


Out[19]:
z quality abemma
count 241554.000000 245591.000000 245591.000000
mean 0.112673 4.027273 1.266260
std 0.070212 0.795907 0.516379
min -0.087500 1.000000 1.000000
25% 0.070200 4.000000 1.000000
50% 0.108600 4.000000 1.000000
75% 0.150200 5.000000 1.000000
max 3.500000 5.000000 3.000000

In [20]:
p = PlotDistro.boxhist(data,'z',bins=20,log=True)


The redshift vs flags distributions


In [21]:
p = data.groupby('quality').z.plot.hist(bins=20,stacked=True)


[PLOT] A multi-distribution plot

Here we need to overplot multiple distributions. Let's define such plot function.


In [22]:
from bokeh.io import output_notebook, show, output_file
output_notebook()


Loading BokehJS ...

In [23]:
pq = PlotHisto.multihist(data,column='z',groupby='quality',nbins=30,mode='adjacent',logscale=True)
show(pq)


Out[23]:

<Bokeh Notebook handle for In[23]>


In [24]:
pa = PlotHisto.multihist(data,column='z',groupby='abemma',nbins=30,mode='adjacent',logscale=True)
show(pa)


Out[24]:

<Bokeh Notebook handle for In[24]>

The redshift vs magnitudes distribution

The best catalog provides bj and R magnitudes, from the SuperCosmos[?]. There are also some other columns expressing different "versions" of magnitude bj:

BJG       :    Final bj magnitude without extinction correction
BJSEL     :    Final bj magnitude with extinction correction
BJG_OLD   :    Original bj magnitude without extinction correction
BJSELOLD  :    Original bj magnitude with extinction correction
GALEXT    :    Galactic extinction value
SB_BJ     :    SuperCosmos bj magnitude without extinction correction
SR_R      :    SuperCosmos R magnitude without extinction correction

For the time being -- since I don't really what are all does mags -- let me focus on SB_BJ and SR_R.


In [25]:
data['SB_BJ'] = df['SB_BJ']
data['SR_R'] = df['SR_R']
data.describe()


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[25]:
z quality abemma SB_BJ SR_R
count 241554.000000 245591.000000 245591.000000 245591.000000 245591.000000
mean 0.112673 4.027273 1.266260 18.625657 17.496698
std 0.070212 0.795907 0.516379 0.955172 1.017280
min -0.087500 1.000000 1.000000 0.000000 0.000000
25% 0.070200 4.000000 1.000000 18.282000 17.091000
50% 0.108600 4.000000 1.000000 18.853000 17.643000
75% 0.150200 5.000000 1.000000 19.205000 18.084000
max 3.500000 5.000000 3.000000 21.286000 21.731000

In [26]:
_nil = data.z==-9
print('Number of z == -9 entries: {}'.format(_nil.sum()))
#print 'Indexes where it is "-9":',np.where(_nil)
data.z[_nil] = None


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/pandas/core/generic.py:4059: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
Number of z == -9 entries: 0
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2885: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)

In [27]:
_nil = data.SB_BJ==0
print('Number of SB_BJ == 0 entries: {}'.format(_nil.sum()))
#print 'Indexes where it is "-9":',np.where(_nil)
data.SB_BJ[_nil] = None


Number of SB_BJ == 0 entries: 218
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [28]:
_nil = data.SR_R==0
print('Number of SR_R == 0 entries: {}'.format(_nil.sum()))
#print 'Indexes where it is "-9":',np.where(_nil)
data.SR_R[_nil] = None


Number of SR_R == 0 entries: 276
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [29]:
describe(data,showNullIndexes=False)


                   z        quality         abemma          SB_BJ           SR_R
count  241554.000000  245591.000000  245591.000000  245373.000000  245315.000000
mean        0.112673       4.027273       1.266260      18.642205      17.516383
std         0.070212       0.795907       0.516379       0.777608       0.831389
min        -0.087500       1.000000       1.000000      12.193000      11.688000
25%         0.070200       4.000000       1.000000      18.284000      17.094000
50%         0.108600       4.000000       1.000000      18.853000      17.644000
75%         0.150200       5.000000       1.000000      19.205000      18.085000
max         3.500000       5.000000       3.000000      21.286000      21.731000

-> Has Nil? (How many?)
z          4037
quality       0
abemma        0
SB_BJ       218
SR_R        276
dtype: int64

In [30]:
nbins=20

In [31]:
plt = PlotDistro.boxhist(data,column='SB_BJ',bins=nbins)



In [32]:
plt = PlotDistro.boxhist(data,column='SR_R',bins=20)



In [33]:
pbj = PlotHisto.multihist(data,column='SB_BJ',groupby='quality',mode='adjacent')
show(pbj)


Out[33]:

<Bokeh Notebook handle for In[33]>


In [34]:
pr = PlotHisto.multihist(data,column='SR_R',groupby='quality',mode='adjacent')
show(pr)


Out[34]:

<Bokeh Notebook handle for In[34]>


In [35]:
import pandas as pd
import numpy as np

mag_min = min(data['SR_R'].min(),data['SB_BJ'].min())
mag_max = max(data['SR_R'].max(),data['SB_BJ'].max())
mag_bins = np.linspace(mag_min,mag_max,nbins)

In [36]:
data['bins_SR'] = pd.cut(data['SR_R'],mag_bins)
p_boxplot_SR = PlotBox.boxplot(data,column='SR_R',by='bins_SR')


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/pandas/core/generic.py:1517: FutureWarning: slice indexers when using iloc should be integers and not floating point
  result = self.iloc[loc]

In [37]:
data['bins_SB'] = pd.cut(data['SB_BJ'],mag_bins)
p_boxplot_SB = PlotBox.boxplot(data,column='SB_BJ',by='bins_SB')


/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/chbrandt/.conda/envs/photoz/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [38]:
def hist_compare(df,columns,bins,logscale=False):
    from bokeh.plotting import figure
    from bokeh.models import CrosshairTool,HoverTool
    
    TOOLS = 'pan,box_zoom,wheel_zoom,crosshair,hover,resize,reset'

    if logscale:
        p_hists = figure(tools=TOOLS,y_axis_type='log')
    else:
        p_hists = figure(tools=TOOLS)
    p_hists.xgrid.grid_line_color = None
    p_hists.ygrid.minor_grid_line_color = 'gray'
    p_hists.ygrid.minor_grid_line_alpha = 0.5
    p_hists.ygrid.minor_grid_line_dash = 'dashed'

    p_hists.select(CrosshairTool).dimensions = ['height']

    p_hists.select(HoverTool).mode = 'hline'
    p_hists.select(HoverTool).tooltips = [("mag: ","$x")]

    x_label = columns[0]
    y_label = columns[1]

    x = df[x_label]
    y = df[y_label]

    hs,b = np.histogram(x,bins=bins,normed=False)

    p_hists.quad(top=hs,
                   bottom=0,
                   left=bins[:-1],
                   right=bins[1:],
                   fill_color="#036564",fill_alpha=0.5,
                   legend=x_label)

    hp,b = np.histogram(y,bins=bins,normed=False)

    hh,bb = histogram2stepfunction(hp,b)

    p_hists.line(x=bb,
                   y=hh,
                   line_color="#D95B43",line_width=2,
                   legend=y_label)

    _b = np.diff(bins)/2+bins[:-1]
    p_hists.circle(x=_b,
                     y=hp,
                     size=9,line_color="#D95B43",line_width=2,
                     fill_color="white",fill_alpha=1,
                     legend=y_label)

    p_hists.yaxis.axis_label = 'Counts'

    return p_hists

In [39]:
p_hists = hist_compare(data,['SB_BJ','SR_R'],mag_bins,logscale=False)

In [40]:
p_hists.plot_height = 400

p_boxplot_SR.tools = p_hists.tools
p_boxplot_SR.x_range = p_hists.x_range
p_boxplot_SR.plot_height = p_hists.plot_height/2
p_boxplot_SR.xaxis.axis_label = None
p_boxplot_SR.ygrid.minor_grid_line_color = None

p_boxplot_SB.tools = p_hists.tools
p_boxplot_SB.x_range = p_hists.x_range
p_boxplot_SB.plot_height = p_hists.plot_height/2
p_boxplot_SB.xaxis.axis_label = None
p_boxplot_SB.ygrid.minor_grid_line_color = None

from bokeh.plotting import gridplot
p = gridplot([[p_hists],[p_boxplot_SB],[p_boxplot_SR]])
show(p)


Out[40]:

<Bokeh Notebook handle for In[40]>


In [41]:
from bokeh import mpl
data.boxplot(column='SB_BJ',by='quality')
show(mpl.to_bokeh())


Out[41]:

<Bokeh Notebook handle for In[41]>


In [42]:
from bokeh import mpl
data.boxplot(column='SR_R',by='quality')
show(mpl.to_bokeh())


Out[42]:

<Bokeh Notebook handle for In[42]>

Plotly

Let me try plotting the 3d (mag_r,mag_bj,z) using plotly


In [43]:
import plotly
print plotly.__version__
plotly.offline.init_notebook_mode()


1.9.6

In [44]:
import plotly.offline as plo

class Scatter3D:
    
    @staticmethod
    def plotly(df,columns):
        from plotly.graph_objs import Scatter3d,Data
        xl = columns[0]
        yl = columns[1]
        zl = columns[2]
        pts = Scatter3d(x=data[xl], y=data[yl], z=data[zl],
                       mode='markers',marker=dict(size=10,symbol='circle',opacity=0.5))
        p = Data([pts])
        return p
    
    @staticmethod
    def matplotlib(df,columns):
        import matplotlib
        from matplotlib import pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D

        fig = plt.figure()
        ax = fig.add_subplot(111,projection='3d')

        xl = columns[0] ; xs = df[xl]
        yl = columns[1] ; ys = df[yl]
        zl = columns[2] ; zs = df[zl]
        
        ax.scatter(xs,ys,zs, marker='o')
        ax.set_xlabel(xl)
        ax.set_ylabel(yl)
        ax.set_zlabel(zl)

        return plt

In [45]:
#plo.plot(scatter3d(data,['z','SB_BJ','SR_R']))

In [46]:
# Let's sample data then...
data_sample = data.sample(frac=0.1)[['z','SB_BJ','SR_R']]
describe(data_sample)


                  z         SB_BJ          SR_R
count  24144.000000  24530.000000  24522.000000
mean       0.112926     18.639922     17.512557
std        0.068442      0.781972      0.836381
min       -0.009500     13.286000     12.144000
25%        0.071300     18.280000     17.087000
50%        0.108700     18.851000     17.643000
75%        0.149700     19.205000     18.088000
max        3.090000     21.067000     20.922000

-> Has Nil? (How many?)
z        415
SB_BJ     29
SR_R      37
dtype: int64

In [47]:
#plo.iplot(Scatter3D.plotly(data_sample,['z','SB_BJ','SR_R']))

Matplotlib

That was heavy stuff. Let me try with the old-good matplotlib


In [48]:
#%matplotlib notebook
%matplotlib inline
plt = Scatter3D.matplotlib(data,['z','SB_BJ','SR_R'])



In [49]:
class PlotScatter:
    
    @staticmethod
    def scatter_color(df,x_column,y_column,color_column,color_logscale=False):
        
        from bokeh.plotting import figure
        from bokeh.models import Range1d
        
        title = "{0} .vs. {1}".format(x_column,y_column)

        TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"

        # create the scatter plot
        p = figure(title = title,
                   min_border = 10,
                   min_border_left = 50,
                   tools=TOOLS)

        p.xaxis.axis_label = x_column
        xmin = df[x_column].min()
        xmax = df[x_column].max()
        p.x_range = Range1d(start = xmin,
                            end = xmax)

        p.yaxis.axis_label = y_column
        ymin = df[y_column].min()
        ymax = df[y_column].max()
        p.y_range = Range1d(start = ymin,
                            end = ymax)

        def scalar2color(vector,logscale=False):
            import numpy as np
            if logscale:
                vector = np.log(1+vector)
            vmin = vector.min()
            vmax = vector.max()
            vfac = 255.0/(vmax-vmin)
            colors = []
            for v in vector:
                if np.isnan(v):
                    clr = '#000000'
                else:
                    r = int((v-vmin)*vfac)
                    g = int(255-((v-vmin)*vfac))
                    b = 0
                    clr = "#%02x%02x%02x" % (r,g,b)
                colors.append(clr)
            return colors
        
        xs = df[x_column]
        ys = df[y_column]
        colors = scalar2color(df[color_column],color_logscale)
        p.scatter(xs, ys, size=5, fill_color=colors, fill_alpha=0.5, line_color=None)
        
        return p

In [50]:
#data_sample = data_sample.sort_values(by=['z'],ascending=True)
p = PlotScatter.scatter_color(data_sample,'SB_BJ','SR_R','z')
show(p)


Out[50]:

<Bokeh Notebook handle for In[50]>

Binning the data

One thing that should be done is to define ways of spliting -- or binning -- a data vector. For example, last plot is hard to visualize the z (i.e, colors) coordinate -- because it's a dense plot, I have thought about level curves, which should clarify the visualization. To do so, we have to define what are the levels we want to define in our data (here, z).

I have to move on on regression now, but here some links about density plots and contours. I read those links because I want to create a "SB_BJ .vs. SR_R" scatter plot smoothed on "z" values, as if I had an surface image, where the height of this surface-image would be proportional of the values of 'z' there. (see the "contours notebook") for motivation/memory.

Regression

I want to explore now how can I fit the z-mag space -- even if I don't know how much corelated the data are. Yes, this is a very preliminary -- or imature -- tentative. Well, let me see what comes out.


In [51]:
data.columns


Out[51]:
Index([u'z', u'quality', u'abemma', u'SB_BJ', u'SR_R', u'bins_SR', u'mean', u'bins_SB'], dtype='object')

In [52]:
data = data[[u'z', u'quality', u'abemma', u'SB_BJ', u'SR_R']]

In [53]:
describe(data)


                   z        quality         abemma          SB_BJ           SR_R
count  241554.000000  245591.000000  245591.000000  245373.000000  245315.000000
mean        0.112673       4.027273       1.266260      18.642205      17.516383
std         0.070212       0.795907       0.516379       0.777608       0.831389
min        -0.087500       1.000000       1.000000      12.193000      11.688000
25%         0.070200       4.000000       1.000000      18.284000      17.094000
50%         0.108600       4.000000       1.000000      18.853000      17.644000
75%         0.150200       5.000000       1.000000      19.205000      18.085000
max         3.500000       5.000000       3.000000      21.286000      21.731000

-> Has Nil? (How many?)
z          4037
quality       0
abemma        0
SB_BJ       218
SR_R        276
dtype: int64

Scikit-learn

I'll use scikit-learn regression algorithms to try it out. Those are documented on the 1st chapter of the documentation.

I will start with some famous in our community, for simplicity:

Data sample


In [54]:
data = data.dropna()
describe(data)


                   z        quality         abemma          SB_BJ           SR_R
count  241341.000000  241341.000000  241341.000000  241341.000000  241341.000000
mean        0.112713       4.076846       1.258294      18.634805      17.507366
std         0.070192       0.703219       0.508247       0.778661       0.830622
min        -0.087500       1.000000       1.000000      12.193000      11.688000
25%         0.070200       4.000000       1.000000      18.276000      17.085000
50%         0.108700       4.000000       1.000000      18.846000      17.636000
75%         0.150200       5.000000       1.000000      19.200000      18.076000
max         3.500000       5.000000       3.000000      20.603000      21.731000

-> Has Nil? (How many?)
z          0
quality    0
abemma     0
SB_BJ      0
SR_R       0
dtype: int64

In [55]:
train = data.sample(frac=0.8)
describe(train)


                   z        quality         abemma          SB_BJ           SR_R
count  193073.000000  193073.000000  193073.000000  193073.000000  193073.000000
mean        0.112617       4.076707       1.258633      18.636240      17.509021
std         0.070018       0.704443       0.509218       0.777729       0.830143
min        -0.087500       1.000000       1.000000      12.193000      11.781000
25%         0.070200       4.000000       1.000000      18.278000      17.086000
50%         0.108700       4.000000       1.000000      18.847000      17.637000
75%         0.150000       5.000000       1.000000      19.200000      18.078000
max         3.500000       5.000000       3.000000      20.603000      21.493000

-> Has Nil? (How many?)
z          0
quality    0
abemma     0
SB_BJ      0
SR_R       0
dtype: int64

In [56]:
test = data.drop(train.index)
describe(test)


                  z       quality        abemma         SB_BJ          SR_R
count  48268.000000  48268.000000  48268.000000  48268.000000  48268.000000
mean       0.113099      4.077401      1.256940     18.629068     17.500745
std        0.070884      0.698311      0.504346      0.782360      0.832513
min       -0.057100      2.000000      1.000000     12.971000     11.688000
25%        0.070400      4.000000      1.000000     18.267000     17.082000
50%        0.108700      4.000000      1.000000     18.841000     17.629000
75%        0.150700      5.000000      1.000000     19.197250     18.071000
max        3.090000      5.000000      3.000000     20.407000     21.731000

-> Has Nil? (How many?)
z          0
quality    0
abemma     0
SB_BJ      0
SR_R       0
dtype: int64

Nearest neighbor

The nn algorithm requires we to defined the number of neighbors it should use and the kind of weighting to apply to each node-neighbor pair, 'uniform' or 'distance'.


In [57]:
kn = 100

In [58]:
from sklearn.neighbors import KNeighborsRegressor as KNR
knn = KNR(kn,weights='uniform')
X = train[['SB_BJ','SR_R']]
y = train.z
train_fit = knn.fit(X,y)

In [59]:
X_test = test[['SB_BJ','SR_R']]
y_predict = train_fit.predict(X_test)

In [60]:
def scatter_kde(x,y,x_label,y_label,lim_scatter=None):
    """
    """
    from bokeh.plotting import figure,gridplot
    from bokeh.models import Range1d
    
    title = "{} .vs. {}".format(x_label,y_label)

    TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"

    x_min,x_max = x.min(),x.max()
    y_min,y_max = y.min(),y.max()
    if lim_scatter is None:
        _min = min(x_min,y_min)
        _max = max(x_max,y_max)
        lim_scatter = [_min,_max]
        
    # create the scatter plot
    p_scatter = figure(title = title,
                       min_border = 10,
                       min_border_left = 50,
                       tools=TOOLS)
    
    p_scatter.x_range = Range1d(start = lim_scatter[0], end = lim_scatter[1])
    p_scatter.xaxis.axis_label = x_label
    
    p_scatter.y_range = Range1d(start = lim_scatter[0], end = lim_scatter[1])
    p_scatter.yaxis.axis_label = y_label

    p_scatter.line(lim_scatter, lim_scatter, color='black', line_width=2)

    p_scatter.scatter(x, y, size=3, color="#3A5785", alpha=0.5)


    # Kernel Density Estimates
    #
    def kde(values,vmin=None,vmax=None):
        import numpy as np
        from scipy.stats import gaussian_kde
        kern = gaussian_kde(values)
        vmin = vmin if vmin is not None else values.min()
        vmax = vmax if vmax is not None else values.max()
        _cov = kern.covariance[0][0]
        pnts = np.arange(vmin, vmax, 10*_cov)
        kde = kern(pnts)
        return kde,pnts


    # Create the HORIZONTAL plot (former histogram)
    #
    #LINE_ARGS = dict(color="#3A5785", line_color=None)
    #

    x_kde,x_grid = kde(x,x_min,x_max)
    hmax = x_kde.max()*1.1

    p_kde_spec = figure(title = None,
                        plot_width = p_scatter.plot_width,
                        plot_height = p_scatter.plot_height/3,
                        x_range = p_scatter.x_range,
                        y_range = (0, hmax),
                        min_border = 10,
                        min_border_left = 50,
                        toolbar_location = None,
                        tools=TOOLS)
    p_kde_spec.xgrid.grid_line_color = None
    #
    p_kde_spec.line(x_grid, x_kde)#,**LINE_ARGS)


    # Create the VERTICAL plot (former histogram)
    #
    th = 42 # need to adjust for toolbar height, unfortunately
    #

    y_kde,y_grid = kde(y,y_min,y_max)
    vmax = y_kde.max()*1.1
    
    p_kde_photo = figure(title = None,
                        plot_width = p_scatter.plot_width/3,
                        plot_height = p_scatter.plot_height+th-10,
                        x_range = (0, vmax),
                        y_range = p_scatter.y_range,
                        min_border = 10,
                        min_border_top = th,
                        toolbar_location = None,
                        tools=TOOLS)
    p_kde_photo.ygrid.grid_line_color = None
    #
    p_kde_photo.line(y_kde, y_grid)


    # Let's adjust the borders
    #
    p_kde_photo.min_border_top = 80
    p_kde_photo.min_border_left = 0
    p_kde_photo.min_border_bottom = 50

    p_kde_spec.min_border_top = 10
    p_kde_spec.min_border_right = 10
    p_kde_spec.min_border_left = 80

    p_scatter.min_border_right = 10
    p_scatter.min_border_left = 80
    p_scatter.min_border_bottom = 50


    # Arrange them (the plots) to a regular grid
    #
    layout = gridplot([[p_scatter,p_kde_photo],[p_kde_spec,None]])

    return layout

In [61]:
from bokeh.plotting import show
p = scatter_kde(test.z,y_predict,'z-real','z-estimated')
show(p)


Out[61]:

<Bokeh Notebook handle for In[61]>


In [62]:
from sklearn.neighbors import KNeighborsRegressor as KNR
knn = KNR(kn,weights='distance')
X = train[['SB_BJ','SR_R']]
y = train.z
train_fit = knn.fit(X,y)
X_test = test[['SB_BJ','SR_R']]
y_predict = train_fit.predict(X_test)

In [63]:
from bokeh.plotting import show
p = scatter_kde(test.z,y_predict,'z-real','z-estimated')
show(p)


Out[63]:

<Bokeh Notebook handle for In[63]>

I just realized the data has some bad quality redshift, quality <= 2. We should clean it also.


In [65]:
rows_good = data.quality >= 3
good = data.loc[rows_good]
describe(good)


                   z        quality         abemma         SB_BJ           SR_R
count  233075.000000  233075.000000  233075.000000  233075.00000  233075.000000
mean        0.112173       4.150505       1.236130      18.61999      17.489441
std         0.069276       0.594673       0.477284       0.78223       0.831475
min        -0.057100       3.000000       1.000000      12.19300      11.688000
25%         0.070400       4.000000       1.000000      18.25600      17.066000
50%         0.108400       4.000000       1.000000      18.83000      17.618000
75%         0.149000       5.000000       1.000000      19.18900      18.059000
max         3.500000       5.000000       3.000000      20.56300      21.731000

-> Has Nil? (How many?)
z          0
quality    0
abemma     0
SB_BJ      0
SR_R       0
dtype: int64

In [66]:
train = good.sample(frac=0.8)
test = good.drop(train.index)

In [67]:
from sklearn.neighbors import KNeighborsRegressor as KNR
knn = KNR(kn,weights='distance')
X = train[['SB_BJ','SR_R']]
y = train.z
train_fit = knn.fit(X,y)
X_test = test[['SB_BJ','SR_R']]
y_predict = train_fit.predict(X_test)

from bokeh.plotting import show
p = scatter_kde(test.z,y_predict,'z-real','z-estimated')
show(p)


Out[67]:

<Bokeh Notebook handle for In[67]>


In [70]:
def scatter_hists(x,y,x_label,y_label,lim_scatter=None):
    """
    """
    
    from bokeh.plotting import figure,gridplot
    from bokeh.models import Range1d,BoxSelectTool,LassoSelectTool
    import numpy as np
    
    title = "{} .vs. {}".format(x_label,y_label)

    TOOLS="pan,wheel_zoom,box_select,lasso_select,box_zoom,reset"

    x_min,x_max = x.min(),x.max()
    y_min,y_max = y.min(),y.max()
    if lim_scatter is None:
        _min = min(x_min,y_min)
        _max = max(x_max,y_max)
        lim_scatter = [_min,_max]
    
    x_bins = np.linspace(_min,_max,100)
    y_bins = x_bins
    
    # create the scatter plot
    p_hist2d = figure(title=title,
                      x_range = lim_scatter,
                      y_range = lim_scatter,
                      plot_width=600,
                      plot_height=600,
                      min_border=10,
                      min_border_left=50,
                      tools=TOOLS)
    p_hist2d.select(BoxSelectTool).select_every_mousemove = False
    p_hist2d.select(LassoSelectTool).select_every_mousemove = False

    p_hist2d.xaxis.axis_label = x_label
    p_hist2d.yaxis.axis_label = y_label

    p_hist2d.line(lim_scatter, lim_scatter, color='light_gray', line_width=1)

    # We will not plot the usual colorful/heatmap-like histogram, but a size-scaled one
    # So the next steps are to compute the histogram-2d itself, then clean it (no zero-counte)
    #  and (re)define the (x,y) grid to plot the points (scaled by the histogram bins counts).
    #
    hist2d, x_edges, y_edges = np.histogram2d(y, x, bins=(x_bins,y_bins))
    assert np.array_equal(x_edges,x_bins)
    assert np.array_equal(y_edges,y_bins)

    # remove null bins
    hist2d_y_ind, hist2d_x_ind = np.where(hist2d>0)

    y_edges = y_edges[hist2d_y_ind]
    x_edges = x_edges[hist2d_x_ind]
    hist2d = hist2d[hist2d_y_ind, hist2d_x_ind]

    # give them a size; no science here, just the old "looks-good"...
    _hmin = hist2d.min()
    _hmax = hist2d.max()
    hist2d_scale = 2 + 10 * np.log2( (hist2d-_hmin) / (_hmax-_hmin) +1 )

    # a DataFrame makes life easier when using Bokeh
    #
    _df = pd.DataFrame({'spec' :x_edges,
                        'photo':y_edges,
                        'scale':hist2d_scale})

    r_hist2d = p_hist2d.square(x =    _df['spec'],
                               y =    _df['photo'],
                               size = _df['scale'],
                               color = "#3A5785",
                               alpha = 0.5)
    #s = p.scatter(x=xs, y=xp, size=1, color="black")


    # create the horizontal histogram
    x_hist, hedges = np.histogram(x, bins=x_bins,
                                 range=lim_scatter)
    hzeros = np.zeros(len(x_hist)-1)
    hmax = max(x_hist)*1.1

    LINE_ARGS = dict(color="#3A5785", line_color=None)

    p_hist_spec = figure(toolbar_location=None,
                         plot_width=p_hist2d.plot_width,
                         plot_height=200,
                         x_range=p_hist2d.x_range,
                         y_range=(-hmax, hmax),
                         title=None,
                         min_border=10,
                         min_border_left=50,
                         tools=TOOLS)
    p_hist_spec.xgrid.grid_line_color = None

    p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=x_hist, color="white", line_color="#3A5785")
    hh1 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
    hh2 = p_hist_spec.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)


    # create the vertical histogram
    y_hist, vedges = np.histogram(y, bins=y_bins,
                                   range=lim_scatter)
    vzeros = np.zeros(len(y_hist)-1)
    vmax = max(y_hist)*1.1

    th = 42 # need to adjust for toolbar height, unfortunately
    p_hist_photo = figure(toolbar_location=None,
                          plot_width=200,
                          plot_height=p_hist2d.plot_height+th-10,
                          x_range=(-vmax, vmax),
                          y_range=p_hist2d.y_range,
                          title=None,
                          min_border=10,
                          min_border_top=th,
                          tools=TOOLS)
    p_hist_photo.ygrid.grid_line_color = None
    p_hist_photo.xaxis.major_label_orientation = -3.14/2

    p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=y_hist, color="white", line_color="#3A5785")
    vh1 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
    vh2 = p_hist_photo.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)

    p_hist_photo.min_border_top = 80
    p_hist_photo.min_border_left = 0

    p_hist_spec.min_border_top = 10
    p_hist_spec.min_border_right = 10

    p_hist2d.min_border_right = 10

    layout = gridplot([[p_hist2d,p_hist_photo],
                       [p_hist_spec,None]])

    def update(attr, old, new):
        inds = np.array(new['1d']['indices'])
        if len(inds) == 0 or len(inds) == len(x_edges):
            hhist1, hhist2 = hzeros, hzeros
            vhist1, vhist2 = vzeros, vzeros
        else:
            neg_inds = np.ones_like(x, dtype=np.bool)
            neg_inds[inds] = False
            hhist1, _ = np.histogram(x[inds_x], bins=hedges)
            vhist1, _ = np.histogram(y[inds_xp], bins=vedges)
            hhist2, _ = np.histogram(x[neg_inds_x], bins=hedges)
            vhist2, _ = np.histogram(y[neg_inds_y], bins=vedges)

        hh1.data_source.data["top"]   =  hhist1
        hh2.data_source.data["top"]   = -hhist2
        vh1.data_source.data["right"] =  vhist1
        vh2.data_source.data["right"] = -vhist2

    r_hist2d.data_source.on_change('selected', update)

    return layout

In [71]:
from bokeh.plotting import show
p = scatter_hists(test.z,y_predict,'z-real','z-estimated')
show(p)


Out[71]:

<Bokeh Notebook handle for In[71]>


In [ ]: